This notebook describes a outlier/novelty detection session on Danish company data.
It requires a comma-separated values file with features extracted from a JSONL file.
In [1]:
# https://github.com/fnielsen/everything
from everything import *
In [2]:
# Read dataframe with features for companies
filename = expanduser('~/workspace/cvrminer/virksomheder-features.csv')
df = read_csv(filename, encoding='utf-8', index_col=0)
In [3]:
# Feature names
df.columns
Out[3]:
In [4]:
# Functions for conversion to numerical dataframes
def to_dummies(df, column):
datatype = df[column].dtypes
if datatype in [int64, float64]:
return df[[column]]
elif datatype == bool:
return df[[column]].astype(int)
elif datatype == 'object':
df_column = df[column].str.get_dummies()
df_column.columns = [column + ":" + col for col in df_column.columns]
return df_column
else:
raise ValueError('Unrecognized datatype for column {}'.format(column))
def dataframe_to_numerical(df):
df_numerical = DataFrame(index=df.index)
for column in df.columns:
print(column)
df_numerical = df_numerical.join(to_dummies(df, column))
return df_numerical
In [5]:
# Numerical dataframe
dfn = dataframe_to_numerical(df)
dfn.shape
Out[5]:
In [6]:
dfn.describe()
Out[6]:
In [7]:
# Preprocessing
imputer = Imputer()
scaler = StandardScaler(with_mean=False)
dfni = scaler.fit_transform(imputer.fit_transform(dfn))
dfni = imputer.fit_transform(dfn)
In [8]:
# Outlier detection/novelty detection with K-means clustering
clusterer = MiniBatchKMeans(n_clusters=8, random_state=1, verbose=False)
clusterer.fit(dfni)
Out[8]:
In [9]:
distances = sum((dfni - clusterer.cluster_centers_[clusterer.labels_, :]) ** 2, axis=1)
indices_clusterer = argsort(-distances)
In [10]:
df.iloc[indices_clusterer[:150], :]
Out[10]:
In [11]:
# Plot histogram for distances related to the company with the largest distance
index_for_max = indices_clusterer[0]
cluster_label = clusterer.labels_[index_for_max]
hist((distances[clusterer.labels_ == cluster_label]), bins=np.logspace(0, 8, 100))
max_value = distances[index_for_max]
ax = gca()
ax.set_xscale('log')
ax.set_yscale('symlog')
ax.annotate('Max distance', xy=(max_value, 5), xytext=(max_value, 20000), arrowprops=dict(facecolor='black'))
xlabel('Distance to cluster center for cluster label {}'.format(cluster_label))
ylabel('Absolute frequency')
title('Histogram of distances for cluster {}'.format(cluster_label))
show()
In [12]:
feature_distances_for_all = (dfni - clusterer.cluster_centers_[clusterer.labels_, :]) ** 2
In [13]:
index_of_interest = index_for_max
index_of_interest = df.index.get_loc(56994912) # SAS Danmark A/S
df.iloc[index_of_interest, :]
Out[13]:
In [14]:
# Bar plot of feature distances for a company
feature_distance = Series(feature_distances_for_all[index_of_interest, :], index=dfn.columns)
feature_distance.sort_values(inplace=True, ascending=False)
feature_distance.iloc[:20][::-1].plot(kind='barh')
ax = gca()
pos = ax.get_position()
pos.x0 = 0.6
ax.set_position(pos)
ax.set_xscale('log')
xlabel('Feature distance')
title('Feature distances for most novel company')
show()
In [15]:
# Write results to an HTML file
filename = expanduser('~/workspace/cvrminer/virksomheder-report.html')
with codecs.open(filename, 'w', encoding='utf-8') as f:
f.write("""
<html>
<head>
<title>Virksomheder report</title>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
</head>
<body>
<h1>Virksomheder report</h1>
""")
f.write("""<h2>Numerical features dataframe summary statistics</h2>""")
f.write(dfn.describe().T.to_html())
f.write("""<h2>Novelty from k-means</h2>""")
f.write(dfn.iloc[indices_clusterer[:50], :].to_html(
escape=False,
formatters={'__index__':
lambda idx: '<a href="http://datacvr.virk.dk/data/visenhed?enhedstype=virksomhed&id={}">{}</a>'.format(
idx, idx)}))
f.write("""
</body>
</html>""")
In [ ]:
In [16]:
# Investigate cluster model as a function of number of clusters
inertias = []
for n_clusters in range(1, 50):
clusterer = MiniBatchKMeans(n_clusters=n_clusters, max_iter=200, max_no_improvement=30, n_init=10)
clusterer.fit(dfni)
inertias.append(clusterer.inertia_)
In [17]:
plot(inertias)
show()
In [ ]: